Loading all required library
library(tidyverse)
## -- Attaching packages --------
## v ggplot2 3.3.0 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts -----------------
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readxl)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Load our dataset from pre-determind working directory
df <- read_excel("Data Analysis - Data Sheets.xlsx", sheet = "PT & FT Data PivotTable format")
View(df)
glimpse(df)
## Observations: 1,840
## Variables: 6
## $ Cluster <chr> "Education", "Education", "Education", "Education", "Fami...
## $ Agency <chr> "Education Agency 1", "Education Agency 2", "Education Ag...
## $ Year <dbl> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 201...
## $ `PT/FT` <chr> "Full-Time", "Full-Time", "Full-Time", "Full-Time", "Full...
## $ Gender <chr> "Female", "Female", "Female", "Female", "Female", "Female...
## $ Headcount <dbl> 180, 2463, 32, 39251, 9817, 44, 82, 3205, 308, 76, 58, 83...
Pre-analysis Check
attach(df)
names(df)
## [1] "Cluster" "Agency" "Year" "PT/FT" "Gender" "Headcount"
df$Type<- df$`PT/FT`
length(unique(Agency))
## [1] 92
any(is.character(df))
## [1] FALSE
table(df$Cluster)
##
## Education Family & Community Services
## 80 60
## Finance, Services & Innovation Health
## 40 660
## Industry Justice
## 160 280
## Planning & Environment Premier & Cabinet
## 160 220
## Transport Treasury
## 120 60
table(df$Type)
##
## Full-Time Part-Time
## 920 920
df %>%
group_by(Cluster,Year) %>%
summarise(Avg.Headcount= mean(Headcount), Max.Headcount= max(Headcount), Min.Headcount= min(Headcount))
Anaysis Part
Trend<- df %>%
select(Year,Gender, Headcount) %>%
group_by(Year, Gender) %>%
summarise(Total_headcount= sum(Headcount))
ggplotly(ggplot(Trend)+
geom_bar(mapping = aes(Year, Total_headcount, fill=Gender), stat = "Identity",
position = "dodge")+
labs(title = "Total headcount by Year"))
ggplot(Trend)+
geom_smooth(aes(Year,Total_headcount, color= Gender))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 2014
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.02
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 4.0804
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : span too small. fewer
## data values than degrees of freedom.
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## 2014
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius 2.02
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 4.0804
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 2014
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 2.02
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 4.0804
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : span too small. fewer
## data values than degrees of freedom.
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## 2014
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius 2.02
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 4.0804
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
df2<- df %>%
filter( Type == "Part-Time")
Clus<- df2 %>%
group_by(Cluster) %>%
summarise(Total_headcount= sum(Headcount))
Clus_18<- df2 %>%
filter(Year==2018) %>%
group_by(Cluster) %>%
summarise(Total_headcount= sum(Headcount))
ggplot(Clus_18)+
geom_bar(mapping = aes(Cluster, Total_headcount),stat="Identity",fill="blue")+
coord_flip()+
labs(title = "Total headcount among the clusters of 2018")
ggplot(Clus)+
geom_bar(mapping = aes(Cluster, Total_headcount),stat="Identity",fill="blue")+
coord_flip()+
labs(title = "Total headcount among the clusters of 4 years")
df2 %>%
count(Cluster, name = "count") %>%
arrange(desc(count)) %>%
ggplot()+
geom_bar(aes(Cluster,count), stat = "identity", fill="darkblue")+
coord_flip()
df2 %>%
group_by(Cluster) %>%
summarise(Avg_Headcount = mean(Headcount)) %>%
ggplot()+
geom_bar(aes(Cluster, Avg_Headcount), stat = "identity", fill="darkblue")+
coord_flip()+
labs(title = "Average headcount among the clusters of 4 years")
Prop<- df2 %>%
group_by(Cluster)%>%
mutate(Prop = Headcount/sum(Headcount)) %>%
ungroup()
df2 %>%
group_by(Cluster)%>%
mutate(Prop = Headcount/sum(Headcount)) %>%
ungroup() %>%
ggplot(aes(Year, Prop, color=Gender))+
geom_smooth(se=F)+
facet_wrap(~Cluster)+
scale_y_continuous(labels = scales::percent)+
labs(title = "Changes of proportions over the years among cluster and gender of Part time workers")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
df2 %>%
ggplot(aes(Year, Headcount, color=Gender))+
geom_smooth(se=F)+
facet_wrap(~Cluster)+
labs(title = "Changes of headcounts over the years among clusters and gender of Part time workers")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Projecting Number of headcount in 2025
df-> df_mod
df_mod$Cluster<- factor(df_mod$Cluster)
df_mod$Gender<- factor(df_mod$Gender)
df_mod$Type<- factor(df_mod$Type)
df_mod<- df_mod %>%
select(-Agency, -'PT/FT')
set.seed(1234)
library(caTools)
sample<- sample.split(df_mod$Headcount, SplitRatio = 0.8)
test<- subset(df_mod, sample == TRUE)
train<- subset(df_mod, sample == FALSE)
model<- lm(Headcount~., data = test)
summary(model)
##
## Call:
## lm(formula = Headcount ~ ., data = test)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7852 -975 -235 329 33639
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5122.359 100263.841 -0.051 0.959
## ClusterFamily & Community Services -5386.357 512.562 -10.509 < 2e-16
## ClusterFinance, Services & Innovation -5940.460 562.284 -10.565 < 2e-16
## ClusterHealth -5825.118 346.265 -16.823 < 2e-16
## ClusterIndustry -6378.582 403.585 -15.805 < 2e-16
## ClusterJustice -6100.311 372.517 -16.376 < 2e-16
## ClusterPlanning & Environment -6657.241 407.763 -16.326 < 2e-16
## ClusterPremier & Cabinet -6971.072 387.218 -18.003 < 2e-16
## ClusterTransport -5682.999 424.588 -13.385 < 2e-16
## ClusterTreasury -6863.252 512.603 -13.389 < 2e-16
## Year 6.444 49.735 0.130 0.897
## GenderMale -686.043 140.593 -4.880 1.17e-06
## TypePart-Time -1034.712 140.656 -7.356 3.01e-13
##
## (Intercept)
## ClusterFamily & Community Services ***
## ClusterFinance, Services & Innovation ***
## ClusterHealth ***
## ClusterIndustry ***
## ClusterJustice ***
## ClusterPlanning & Environment ***
## ClusterPremier & Cabinet ***
## ClusterTransport ***
## ClusterTreasury ***
## Year
## GenderMale ***
## TypePart-Time ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2824 on 1601 degrees of freedom
## Multiple R-squared: 0.2197, Adjusted R-squared: 0.2139
## F-statistic: 37.57 on 12 and 1601 DF, p-value: < 2.2e-16
predict<- predict(model, train)
df_mod$Pred.Headcount <- predict(model,data.frame(Year=2025,Type=df_mod$Type, Gender=df_mod$Gender, Cluster=df_mod$Cluster))
Different insights of 2025
df_mod %>%
filter(Type=="Full-Time") %>%
ggplot()+
geom_bar(aes(Cluster, Pred.Headcount, fill=Gender), stat = "identity", position = "dodge")+
coord_flip()+
labs(title = "Number of headcount in 2025 of Full-time workers ",
y="Headcount")
df_mod %>%
filter(Type=="Part-Time") %>%
ggplot()+
geom_bar(aes(Cluster, Pred.Headcount, fill=Gender), stat = "identity", position = "dodge")+
coord_flip()+
labs(title = "Number of headcount in 2025 of Part-Time workers ",
y="Headcount")
#after neglecting negetive headcount
df_mod %>%
mutate(Pred.Headcount= if_else(Pred.Headcount<0, 0, Pred.Headcount)) %>%
filter(Type=="Part-Time") %>%
ggplot()+
geom_bar(aes(Cluster, Pred.Headcount, fill=Gender), stat = "identity", position = "dodge")+
coord_flip()+
labs(title = "Number of headcount in 2025 of Part-Time workers ",
y="Headcount")